import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
# Load the delivery records; the relative path is resolved against the
# current working directory.
df = pd.read_csv('delivery.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 45593 entries, 0 to 45592 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 45593 non-null object 1 Delivery_person_ID 45593 non-null object 2 Delivery_person_Age 45593 non-null int64 3 Delivery_person_Ratings 45593 non-null float64 4 Restaurant_latitude 45593 non-null float64 5 Restaurant_longitude 45593 non-null float64 6 Delivery_location_latitude 45593 non-null float64 7 Delivery_location_longitude 45593 non-null float64 8 Type_of_order 45593 non-null object 9 Type_of_vehicle 45593 non-null object 10 Time_taken(min) 45593 non-null int64 dtypes: float64(5), int64(2), object(4) memory usage: 3.8+ MB None
# (rows, columns) of the loaded frame.
df.shape
(45593, 11)
# Per-column dtypes, to see which columns are numeric and which are object-typed.
df.dtypes
ID object Delivery_person_ID object Delivery_person_Age int64 Delivery_person_Ratings float64 Restaurant_latitude float64 Restaurant_longitude float64 Delivery_location_latitude float64 Delivery_location_longitude float64 Type_of_order object Type_of_vehicle object Time_taken(min) int64 dtype: object
# Number of missing values in each column.
df.isnull().sum()
ID 0 Delivery_person_ID 0 Delivery_person_Age 0 Delivery_person_Ratings 0 Restaurant_latitude 0 Restaurant_longitude 0 Delivery_location_latitude 0 Delivery_location_longitude 0 Type_of_order 0 Type_of_vehicle 0 Time_taken(min) 0 dtype: int64
The dataset does not contain duplicate or null values.
# Preview the rows in shuffled order: frac=1 samples every row. The result is
# not assigned, so df itself is left unchanged.
df.sample(frac=1)
| ID | Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Type_of_order | Type_of_vehicle | Time_taken(min) | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 27098 | B815 | COIMBRES14DEL01 | 21 | 4.9 | 11.003681 | 76.975525 | 11.113681 | 77.085525 | Snack | scooter | 27 |
| 18223 | B248 | MYSRES09DEL02 | 31 | 4.9 | 12.323194 | 76.630583 | 12.343194 | 76.650583 | Snack | motorcycle | 32 |
| 21065 | B4EC | MYSRES08DEL03 | 38 | 5.0 | 12.297954 | 76.665169 | 12.387954 | 76.755169 | Meal | electric_scooter | 43 |
| 36111 | 5A80 | JAPRES16DEL03 | 39 | 4.3 | 26.849596 | 75.800512 | 26.939596 | 75.890512 | Buffet | scooter | 34 |
| 17974 | C76A | LUDHRES15DEL01 | 37 | 4.4 | 30.899584 | 75.809346 | 30.909584 | 75.819346 | Meal | motorcycle | 39 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23182 | B48B | MUMRES03DEL02 | 26 | 4.7 | 19.223840 | 72.841347 | 19.303840 | 72.921347 | Drinks | motorcycle | 31 |
| 29111 | 1EB6 | HYDRES15DEL02 | 29 | 4.6 | 17.459710 | 78.368855 | 17.479710 | 78.388855 | Buffet | motorcycle | 19 |
| 10137 | 8911 | CHENRES03DEL01 | 38 | 4.4 | 13.091809 | 80.219104 | 13.161809 | 80.289104 | Drinks | motorcycle | 33 |
| 36862 | D5D1 | AURGRES06DEL02 | 26 | 4.6 | 19.874449 | 75.360232 | 19.954449 | 75.440232 | Snack | motorcycle | 20 |
| 2331 | 360F | RANCHIRES02DEL02 | 24 | 5.0 | 0.000000 | 0.000000 | 0.020000 | 0.020000 | Buffet | motorcycle | 17 |
45593 rows × 11 columns
Calculation of the distance between the restaurant and the delivery location using the haversine formula
# Radius of the Earth in kilometers; distcalculate() scales by it, so the
# returned distances are in kilometers.
R = 6371


def deg_to_rad(degrees):
    """Convert an angle from degrees to radians.

    Works element-wise on NumPy arrays and pandas Series as well as on
    plain numbers, because it is a single multiplication.
    """
    return degrees * (np.pi/180)


def distcalculate(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance in kilometers (scaled by the module-level radius ``R``).
        Scalars give a NumPy float; arrays/Series give an element-wise result.
    """
    d_lat = deg_to_rad(lat2 - lat1)
    d_lon = deg_to_rad(lon2 - lon1)
    a = (np.sin(d_lat / 2) ** 2
         + np.cos(deg_to_rad(lat1)) * np.cos(deg_to_rad(lat2))
         * np.sin(d_lon / 2) ** 2)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) evaluate to NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c
# Calculate the distance between each restaurant/delivery pair.
# distcalculate() is built from NumPy element-wise operations, so whole columns
# can be passed in one call. This replaces a per-row loop of label-based .loc
# lookups, which was slow and only worked when the index labels were 0..n-1.
df['distance'] = distcalculate(df['Restaurant_latitude'],
                               df['Restaurant_longitude'],
                               df['Delivery_location_latitude'],
                               df['Delivery_location_longitude'])
# Preview shuffled rows again, now including the new 'distance' column.
df.sample(frac=1)
| ID | Delivery_person_ID | Delivery_person_Age | Delivery_person_Ratings | Restaurant_latitude | Restaurant_longitude | Delivery_location_latitude | Delivery_location_longitude | Type_of_order | Type_of_vehicle | Time_taken(min) | distance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 20015 | 82CE | COIMBRES010DEL02 | 39 | 4.1 | 11.003008 | 76.975440 | 11.133009 | 77.105440 | Meal | scooter | 47 | 20.253789 |
| 38820 | 3FB1 | VADRES11DEL01 | 27 | 4.6 | 22.308096 | 73.167753 | 22.348096 | 73.207753 | Drinks | scooter | 25 | 6.058919 |
| 20575 | 2.5E+05 | SURRES06DEL01 | 30 | 3.8 | 21.185047 | 72.808590 | 21.295047 | 72.918590 | Snack | motorcycle | 40 | 16.720679 |
| 35700 | 97A7 | PUNERES09DEL02 | 21 | 4.6 | 18.536562 | 73.896485 | 18.556562 | 73.916485 | Meal | scooter | 27 | 3.064487 |
| 8685 | 4272 | SURRES03DEL02 | 38 | 4.5 | 21.186884 | 72.793616 | 21.236884 | 72.843616 | Drinks | scooter | 25 | 7.600984 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 31017 | 9067 | INDORES02DEL02 | 22 | 5.0 | 22.651847 | 75.881991 | 22.671847 | 75.901991 | Drinks | scooter | 18 | 3.026096 |
| 34025 | 7806 | JAPRES18DEL01 | 31 | 4.3 | 26.913987 | 75.752891 | 26.953987 | 75.792891 | Buffet | motorcycle | 39 | 5.958760 |
| 25563 | 25B8 | COIMBRES12DEL03 | 30 | 4.8 | 11.000762 | 76.981876 | 11.060762 | 77.041876 | Meal | scooter | 28 | 9.348449 |
| 21031 | 74FA | HYDRES19DEL01 | 30 | 4.5 | 17.458998 | 78.500366 | 17.568998 | 78.610366 | Buffet | electric_scooter | 31 | 16.901691 |
| 11322 | 8F57 | INDORES07DEL02 | 23 | 4.6 | 22.722634 | 75.886959 | 22.742634 | 75.906959 | Buffet | scooter | 34 | 3.025377 |
45593 rows × 12 columns
# Delivery time against distance; marker size also encodes the delivery time
# and an OLS trendline is overlaid.
figure = px.scatter(
    df,
    x="distance",
    y="Time_taken(min)",
    size="Time_taken(min)",
    trendline="ols",
    title="Relationship Between Distance and Time Taken",
)
figure.show()
There is a consistent relationship between the time taken and the distance travelled to deliver the food. It means that most delivery partners deliver food within 25-30 minutes, regardless of distance.
# Delivery time against partner age; markers are sized by delivery time and
# colored by distance, with an OLS trendline overlaid.
age_plot_options = {
    "x": "Delivery_person_Age",
    "y": "Time_taken(min)",
    "size": "Time_taken(min)",
    "color": "distance",
    "trendline": "ols",
    "title": "Relationship Between Time Taken and Age",
}
figure = px.scatter(df, **age_plot_options)
figure.show()
There is a linear relationship between the time taken to deliver the food and the age of the delivery partner. It means young delivery partners take less time to deliver the food compared to the older partners.
# Delivery time against partner rating; markers are sized by delivery time and
# colored by distance, with an OLS trendline overlaid.
figure = px.scatter(
    df,
    x="Delivery_person_Ratings",
    y="Time_taken(min)",
    size="Time_taken(min)",
    color="distance",
    trendline="ols",
    title="Relationship Between Time Taken and Ratings",
)
figure.show()
There is an inverse linear relationship between the time taken to deliver the food and the ratings of the delivery partner. It means delivery partners with higher ratings take less time to deliver the food compared to partners with low ratings.
# Distribution of delivery time per vehicle type, split by order type.
fig = px.box(
    data_frame=df,
    x="Type_of_vehicle",
    y="Time_taken(min)",
    color="Type_of_order",
)
fig.show()
There is not much difference between the time taken by delivery partners depending on the vehicle they are driving and the type of food they are delivering.
The features that contribute most to the food delivery time based on our analysis are:
-age of the delivery partner
-ratings of the delivery partner
-distance between the restaurant and the delivery location
#splitting data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from math import sqrt
# Model inputs are the partner's age, the partner's rating and the computed
# distance; the target is the delivery time column.
feature_columns = ["Delivery_person_Age", "Delivery_person_Ratings", "distance"]
x = df[feature_columns]
y = df["Time_taken(min)"]
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
print(x.shape, y.shape, sep="\n")
(45593, 3) (45593,)
# First five feature rows of the training split.
x_train.head(5)
| Delivery_person_Age | Delivery_person_Ratings | distance | |
|---|---|---|---|
| 29044 | 37 | 4.8 | 1.514805 |
| 41736 | 28 | 4.2 | 4.663310 |
| 17874 | 24 | 4.6 | 10.586541 |
| 42093 | 34 | 4.0 | 6.058874 |
| 22952 | 28 | 4.8 | 17.297866 |
# Matching first five target values of the training split.
y_train.head(5)
29044 28 41736 16 17874 28 42093 32 22952 26 Name: Time_taken(min), dtype: int64
Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# Seed the forest so repeated runs produce the same model and metrics; the
# train/test split above is seeded with 42, so the same value is reused here.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(x_train, y_train)
RandomForestRegressor()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestRegressor()
# Score the random forest on the held-out split.
y_pred = rfr.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, so the printed values are unchanged, but keeping the documented
# order avoids wrong results if an asymmetric metric (e.g. r2_score) is added.
rfr_mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error =", mean_absolute_error(y_test, y_pred))
print("Mean Squarred Error =", rfr_mse)
# RMSE is derived from the MSE already computed instead of recomputing it.
print("Root Mean Square Error =", sqrt(rfr_mse))
Mean Absolute Error = 6.338063968296876 Mean Squarred Error = 66.69381612232372 Root Mean Square Error = 8.166628197874795
XGBoost Regressor
from xgboost import XGBRegressor
# Build the XGBoost regressor with the library's default hyperparameters
# (no arguments are passed) and fit it on the same training split.
xgbr=XGBRegressor()
xgbr.fit(x_train,y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=None, num_parallel_tree=None,
predictor=None, random_state=None, ...)
y_pred=xgbr.predict(x_test)
# Score the XGBoost predictions held in y_pred against the held-out targets.
# scikit-learn metrics take (y_true, y_pred) in that order. MAE and MSE are
# symmetric, so the printed values are unchanged, but keeping the documented
# order avoids wrong results if an asymmetric metric (e.g. r2_score) is added.
xgbr_mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error =", mean_absolute_error(y_test, y_pred))
print("Mean Squarred Error =", xgbr_mse)
# RMSE is derived from the MSE already computed instead of recomputing it.
print("Root Mean Square Error =", sqrt(xgbr_mse))
Mean Absolute Error = 5.825209372163093 Mean Squarred Error = 55.21338319296294 Root Mean Square Error = 7.430570852428697
# Prompt for one delivery's features and print both models' predictions.
print("Food Delivery Time Prediction")
a = int(input("Age of Delivery Partner: "))
b = float(input("Ratings of Previous Deliveries: "))
# The distance feature is a float in the training data, so parse it as float;
# int() raised ValueError on inputs such as "7.5".
c = float(input("Total Distance: "))
# Wrap the inputs in a one-row DataFrame that carries the training column
# names in training order. Passing a bare ndarray makes scikit-learn warn
# "X does not have valid feature names".
features = pd.DataFrame([[a, b, c]], columns=x_train.columns)
print("Predicted Delivery Time in Minutes(RFR) = ", rfr.predict(features))
print("Predicted Delivery Time in Minutes(XGBR) = ", xgbr.predict(features))
Food Delivery Time Prediction Age of Delivery Partner: 24 Ratings of Previous Deliveries: 3 Total Distance: 30 Predicted Delivery Time in Minutes(RFR) = [35.7] Predicted Delivery Time in Minutes(XGBR) = [35.189297]
C:\Users\rajes\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py:439: UserWarning: X does not have valid feature names, but RandomForestRegressor was fitted with feature names
import pickle

# Persist both fitted regressors to the working directory so they can be
# reloaded without retraining.
# NOTE: pickle executes arbitrary code on load; only unpickle these files
# from a trusted location.
for model_path, fitted_model in (('rfr_model.pkl', rfr), ('xgbr_model.pkl', xgbr)):
    with open(model_path, 'wb') as f:
        pickle.dump(fitted_model, f)